Preprocessing of the data

I first read in the clinical and metabolomics data:

samp_rd <- read.xlsx("~/Documents/VIB/Projects/Integrative_Paris/Local_cohort/Data synthesis local cohort Saint-Louis 17012019.xlsx")
samp_rd$DOB <- as.Date(samp_rd$DOB, format = "%d.%m.%Y")
samp_rd$DOG <- as.Date(samp_rd$DOG, format = "%d.%m.%Y")
samp_rd$DATEOFSAMPLE <- as.Date(samp_rd$DATEOFSAMPLE, format = "%d.%m.%Y")

info <- extract_info_metabo(metadata_file = samp_rd,
                            metabo_file = "~/Documents/VIB/Projects/Integrative_Paris/Local_cohort/Metabo/Metabolomic local cohort Saint-Louis_filtered.xlsx")
rd_meta <- info$subset_meta
meta_metabo <- info$meta_metabo
data_metabolites <- info$data_metabolites

I generate the age and gender vectors that will be used for modeling in all patient groups (donors, recipients and couples). These vectors are : - gender_comp : the donor and recipient gender (“FM”, “FF”, “MF” or “MM”) - age_recip : the age of the recipient

##          COUPLENUMBER GENDER        DOG        DOB            GROUP
## D1_TOL              1      F       <NA> 1962-07-06     non_tolerant
## R1_2_TOL            1      F 2014-08-06 1954-12-05     non_tolerant
## D2_NoTOL            2      F       <NA> 1967-04-03     non_tolerant
## R2_NoTOL            2      M 2013-05-02 1953-12-16     non_tolerant
## D3_TOL              3      F       <NA> 1990-04-03 primary_tolerant
## R3_1_TOL            3      M 2014-01-02 1991-05-19 primary_tolerant
##          gender_comp age_recip
## D1_TOL            FF     21794
## R1_2_TOL          FF     21794
## D2_NoTOL          FM     21687
## R2_NoTOL          FM     21687
## D3_TOL            FM      8264
## R3_1_TOL          FM      8264

I then filter out the metabolies that are xenobiotics drugs, as well as the metabolites which are not common to both the St Louis and the Cryostem cohort:

data_metabolites <- data_metabolites[,-which((meta_metabo[2,]=="Xenobiotics")&(meta_metabo[1,]=="Drug"))] # rm drug xenobiotics
data_metabo_national <- read.xlsx("~/Documents/VIB/Projects/Integrative_Paris/National_cohort/Metabo/Metabo NATIONAL cohort CRYOSTEM.xlsx")
national_metabolites <- data_metabo_national$X1[-c(1,2)]
# remove metabolites that are not present in both the national and the St_Louis cohort :
data_metabolites <- data_metabolites[,which(colnames(data_metabolites) %in% national_metabolites)]

Then, I filter out the metabolites for which we have more than 50% of missing values in all our sub-groups (donors or recipients, non-tolerant, tolerant 1 or tolerant 2): These are the metabolites that we filter out:

I then replace the missing values by 1/2 of the smallest value measured for each metabolite plus some noise:

for(i in 1:ncol(data_metabo)){ #replace remaining NA by 1/2 min column value + noise
  x <- data_metabo[,i]
  to_replace <- data_metabo[is.na(x),i]
  with_noise <- jitter(rep(0.5*(min(as.numeric(x[-which(is.na(x))]))), length(to_replace)))
  data_metabo[is.na(x),i] <- with_noise
}

I then filter out the metabolites that do not vary enough across patients:

sds <- apply(data_metabo,2,sd)
plot(sort(sds), type = "l", ylim = c(0,10^7))
abline(h=quantile(sds, 0.23), col="red")

data_metabo <- data_metabo[,which(sds>=quantile(sds, 0.23))]
rnames <- rownames(data_metabo)
data_metabo <- apply(data_metabo,2,as.numeric)
rownames(data_metabo) <- rnames

And I finally log-transform, normalise and save the data :

mat2use <- merge.data.frame(as.data.frame(rd_meta[,2], row.names = rownames(rd_meta)),
                            data_metabo, by = "row.names") %>%
  tibble::column_to_rownames(var="Row.names")

logdata <- LogTransform(mat2use)
normdata <- Normalise(logdata$output, method = "median")
colnames(normdata$output) <- colnames(logdata$output)
norm_data <- normdata$output

# merge dataframes:
big_mat <- merge.data.frame(normdata$output, rd_meta, by = "row.names") %>%
  tibble::column_to_rownames("Row.names")

logdata <- logdata$output
meta_metabo <- meta_metabo[,which(as.character(meta_metabo[3,])%in%colnames(norm_data))] # only selected metabolites

Analysis on the recipients :

First, we focus only on the metabolomics data coming from the recipients, to see if differences in certain metabolites can explain tolerance in these patients.

load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/recip/norm_data.RData")
load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/recip/meta_metabo.RData")
load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/recip/big_mat.RData")
norm_data$Group <- as.factor(norm_data$Group)

samp_recip <- samp_rd[which(samp_rd$METABONAME %in% rownames(norm_data)),]

We can perform a principal components analysis on the recipients to see if we would already see some structure in the data:

pca_met <- prcomp(norm_data %>% select(-"Group"))

It looks like the first principal component is by far the most informative:

plot(pca_met)

I plot the FCS names of the recipients, for easier comparison of the metabolomics data with the results found in the CyTOF data. The groups already seem to be slightly separated on the first PC, with non tolerant recipients on the left and tolerant recipients more to the right. I also colored the recipients per CMV status, but there doesn’t seem to be a CMV positive group, as opposed to what was observed in the CyTOF data.

I try random forest models on all features of the recipients, to already roughly see if there is some information in the recipients data, before performing any feature selection :

## 
## Call:
##  randomForest(formula = Group ~ ., data = norm_data, mtry = 40) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 40
## 
##         OOB estimate of  error rate: 20.59%
## Confusion matrix:
##                    Non_Tolerant Primary_tolerant Secondary_tolerant
## Non_Tolerant                 21                0                  0
## Primary_tolerant              2                5                  0
## Secondary_tolerant            3                2                  1
##                    class.error
## Non_Tolerant         0.0000000
## Primary_tolerant     0.2857143
## Secondary_tolerant   0.8333333

The classification of the recipients into three groups already gives interesting results, since it seems like we can classify the non tolerant and primary tolerant patients quite accurately. However, only 2 out of the 6 secondary tolerant recipients were correctly classified here, so we will divide the classification problem in two. First, we try to classify between non tolerant and tolerant patients:

## 
## Call:
##  randomForest(formula = group ~ ., data = df_tmp, mtry = mtry,      ntree = ntree, importance = T, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 10000
## No. of variables tried at each split: 30
## 
##         OOB estimate of  error rate: 8.82%
## Confusion matrix:
##              non_tolerant tolerant class.error
## non_tolerant           20        1  0.04761905
## tolerant                2       11  0.15384615

Since the out of bag error on the model classifying tolerant from non tolerant recipients is quite low (and therefore the model is potenitally quite informative), we can have a closer look at the features that were most important in this model:

features_idx <- which(rf1$importance[,3]>=0.01)
features_of_interest <-
  rownames(rf1$importance)[features_idx]

subst.mds <- cmdscale(1 - rf1$proximity, eig=TRUE)
op <- par(pty="s")
pairs(cbind(norm_data[,names(features_idx)], subst.mds$points), cex=0.6, gap=0,
      col=c("red", "blue")[as.numeric(norm_data$group)],
      main="Predictors and MDS of Proximity Based on RandomForest")

We can also generate a volcano plot of the metabolites between tolerant and non tolerant recipients: I also plot boxplots of the “top metaolites” in the volcano plot.

mat2use <- norm_data
gr <- as.character(norm_data$group)
gr[which(gr!= "non_tolerant")] <- "tolerant"
mat2use$group <- as.factor(gr)

res <- TwoGroup(mat2use)
VolcanoPlot(res$output[,4], res$output[,2], cexlab = 0.6)

MetBoxPlots(mat2use, "glycocholate",cols = c("red", "blue"),main = "glycocholate")

MetBoxPlots(mat2use, "taurocholate",cols = c("red", "blue"),main = "taurocholate")

MetBoxPlots(mat2use, "dehydroisoandrosterone.sulfate..DHEA.S.",cols = c("red", "blue"),main = "dehydroisoandrosterone sulfate (DHEA-S)")

MetBoxPlots(mat2use, "androstenediol..3beta.17beta..disulfate..2.",cols = c("red", "blue"),main = "androstenediol (3beta,17beta) disulfate (2)")

Second, we can try to classify only primary and seconday tolerant recipients:

## 
## Call:
##  randomForest(formula = group ~ ., data = df_tol, mtry = mtry,      ntree = ntree, importance = T, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 10000
## No. of variables tried at each split: 30
## 
##         OOB estimate of  error rate: 53.85%
## Confusion matrix:
##                    primary_tolerant secondary_tolerant class.error
## primary_tolerant                  4                  3   0.4285714
## secondary_tolerant                4                  2   0.6666667

Volcano plots between primary and secondary tolerant patients:

mat2use <- norm_data[which(norm_data$group!="non_tolerant"),]

res <- TwoGroup(mat2use)
VolcanoPlot(res$output[,4], res$output[,2], cexlab = 0.6)

MetBoxPlots(mat2use, "N.methylproline",cols = c("blue","green"),main = "N-methylproline")

MetBoxPlots(mat2use, "chiro.inositol",cols = c("blue","green"),main = "chiro-inositol")

MetBoxPlots(mat2use, "X1.7.dimethylurate",cols = c("blue","green"),main = "1,7-dimethylurate")

MetBoxPlots(mat2use, "X1.methylurate",cols = c("blue","green"),main = "1-methylurate")

Feature Selection

Identifying metabolites that differ between tolerant and non tolerant recipients:

Run the permutation distribution, taking demographic variables into account: I first add the demographics information (on age and gender compatibiliry) to the data.

I then compute the permutation distribution for non vs tolerant recipients.

Visualise the resulting quantile values: In the first plot: quantile values of the models on metabolites one by one In the second plot: quantile values of the difference between group~ age+gender and group~ age+gender+metabolite

load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/recip/norm_data_TNT.RData")
load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/recip/perm_vals_TNT.RData")
par(mfrow=c(2,1))
barplot(unlist(map(perm_vals, 1)))
barplot(unlist(map(perm_vals, 2)))

saveRDS(norm_data, file = paste0(recip_path, "norm_data_TNT.RDS"))
saveRDS(perm_vals, file = paste0(recip_path, "perm_vals_TNT.RDS"))

I compute the median per group for every gene, to use it later in the graphs.

Threshold of 0.90

Selected metabolites for the recipients (with a threshold of 0.90), after computing permutation distributions for every metabolite:

# I first rearrange the nor data matrix (so that I don't have to add +1 to select features)

norm_data <- norm_data %>% 
  rownames_to_column("rn") %>% 
  select(-group, everything()) %>% 
  column_to_rownames("rn")
selected_ft_recip_qt <- select_features(perm_vals = perm_vals,
                                        norm_data = norm_data,
                                        threshold = 0.90,
                                        file_path = recip_path,
                                        file_name = "perm_val_TNT_90_thresh.xlsx") 
## Note: zip::zip() is deprecated, please use zip::zipr() instead
print(paste0(length(selected_ft_recip_qt),
             " metabolites were selected with a 0.90 threshold in the St Louis cohort."))
## [1] "191 metabolites were selected with a 0.90 threshold in the St Louis cohort."

We can see which of these metabolites were also selected in the Cryostem cohort to separate tolerant from non tolerant recipients:

## [1] "44 features were commonly found in both cohorts"

We can see how many of these features had the same behaviour in the two cohorts (ie, were over or underexpressed in the same group of patients)

## [1] "42 features out of the 44 common selected features have the same behaviour in both cohorts."
##  [1] "X1..1.enyl.palmitoyl..2.arachidonoyl.GPC..P.16.0.20.4.."
##  [2] "X1.arachidonoyl.GPE..20.4n6.."                          
##  [3] "X1.linoleoyl.2.arachidonoyl.GPC..18.2.20.4n6.."         
##  [4] "X1.palmitoyl.2.gamma.linolenoyl.GPC..16.0.18.3n6.."     
##  [5] "X1.palmitoylglycerol..16.0."                            
##  [6] "X1.stearoyl.2.arachidonoyl.GPC..18.0.20.4."             
##  [7] "X1.stearoyl.2.arachidonoyl.GPI..18.0.20.4."             
##  [8] "X3.methyl.2.oxovalerate"                                
##  [9] "X4.methyl.2.oxopentanoate"                              
## [10] "alanine"                                                
## [11] "androstenediol..3beta.17beta..monosulfate..1."          
## [12] "androsterone.sulfate"                                   
## [13] "asparagine"                                             
## [14] "beta.alanine"                                           
## [15] "choline.phosphate"                                      
## [16] "cysteine.glutathione.disulfide"                         
## [17] "dehydroisoandrosterone.sulfate..DHEA.S."                
## [18] "epiandrosterone.sulfate"                                
## [19] "gamma.glutamylalanine"                                  
## [20] "gamma.glutamylglutamine"                                
## [21] "glucuronate"                                            
## [22] "histidine"                                              
## [23] "hydroxycotinine"                                        
## [24] "indolelactate"                                          
## [25] "isoleucine"                                             
## [26] "leucine"                                                
## [27] "lysine"                                                 
## [28] "methionine"                                             
## [29] "methyl.4.hydroxybenzoate.sulfate"                       
## [30] "N.palmitoyl.heptadecasphingosine..d17.1.16.0.."         
## [31] "ornithine"                                              
## [32] "oxalate..ethanedioate."                                 
## [33] "proline"                                                
## [34] "serine"                                                 
## [35] "sphingosine.1.phosphate"                                
## [36] "theobromine"                                            
## [37] "threonate"                                              
## [38] "threonine"                                              
## [39] "trans.4.hydroxyproline"                                 
## [40] "tryptophan"                                             
## [41] "tyrosine"                                               
## [42] "valine"

Graph of the selected metabolites that are common between the two cohorts: The features that were correlated (ie expressed in similar ways among the recipients) are linked in the graph. The features that were more expressed in the non tolerant St Louis recipients are colored in red, while the features that were more expressed in tolerant recipients are colored in blue.

I save the common features in an excel file: The excel file contains information on which features were correlated in the graph above. It also contains, for every gene, the associated logFC, pvalue…, where the comparison is in non tolerant versus tolerant patients (eg, a positive Fold Change for a gene therefore means that the gene is overexpressed in non tolrant patients)

We can try to build a RF model based on these common features:

## 
## Call:
##  randomForest(formula = group ~ ., data = df_tmp, mtry = mtry,      ntree = ntree, importance = T, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 5.88%
## Confusion matrix:
##              non_tolerant tolerant class.error
## non_tolerant           20        1  0.04761905
## tolerant                1       12  0.07692308

The classification of the St Louis recipients into tolerant and non tolerant groups after feature selection is just as good as it already was before.

Including gender compatibility and age recip in the feature selection

Now that we have selected metabolites that seemed to play a role in tolerance that were common to both cohorts (with a threhold over 0.90), we decide to investigate into these metabolites to see if some of them are related to the age of the recipients, to the gender compatibility between the donor and recipient, to both or to none of these two causes.

I first build new models using the metabolites that were kept in the two cohorts as informative when comparing tolerant and non tolerant patients:

new_models <- lapply(seq_along(common_features$common_features), function(idx){
  dta_tmp <- norm_data[,c(common_features$common_features[idx],
                        "group", "age_recip", "gender_comp")]
  colnames(dta_tmp)[1] <- "Var"
  fit1 <- glm(group ~ Var, data=dta_tmp,trace=F, family = "binomial")
  fit2 <- glm(group ~ gender_comp + Var,
              data=dta_tmp,trace=F, family = "binomial")
  fit3 <- glm(group ~ age_recip + Var, 
              data=dta_tmp,trace=F, family = "binomial")
  fit4 <- glm(group ~ gender_comp + age_recip + Var, 
              data=dta_tmp,trace=F, family = "binomial")
  list(fit1, fit2, fit3, fit4)
})
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
plots_gender_age(pdf_name = "../plots/metabo/recip/age_and_gender/tol_nontol_0.9.pdf",
                 norm_data = norm_data,
                 features = common_features$common_features)
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis
## quartz_off_screen 
##                 2
plots_gender_age(pdf_name = "../plots/metabo/recip/age_and_gender/tol_nontol_0.9_behavior.pdf",
                 norm_data = norm_data,
                 features = common_beh_features)
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis
## quartz_off_screen 
##                 2

Looking at differences between primary and secondary tolerant recipients

I compute the permutation distribution for primary vs secondary tolerant recipients:

Visualise the resulting quantile values: In the first plot: quantile values of the models on metabolites one by one In the second plot: quantile values of the difference between group~ age+gender and group~ age+gender+metabolite

load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/recip/norm_data_PS.RData")
load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/recip/perm_vals_PS.RData")
par(mfrow=c(2,1))
barplot(unlist(map(perm_vals, 1)))
barplot(unlist(map(perm_vals, 2)))

saveRDS(norm_data, file = paste0(recip_path, "norm_data_PTST.RDS"))
saveRDS(perm_vals, file = paste0(recip_path, "perm_vals_PTST.RDS"))

Compute the median per group for every gene, to use it later in the graphs:

gr_medians <- norm_data %>% 
  group_by(group) %>% 
  summarize_if(is.numeric, median) %>% 
  as.data.frame()
rownames(gr_medians) <- gr_medians$group

Threshold of 0.90

Selected metabolites for the recipients (with a threshold of 0.90), after computing permutation distributions for every metabolite:

# I first rearrange the nor data matrix (so that I don't have to add +1 to select features)

norm_data <- norm_data %>% 
  rownames_to_column("rn") %>% 
  select(-group, everything()) %>% 
  column_to_rownames("rn")
selected_ft_recip_qt <- select_features(perm_vals = perm_vals,
                                        norm_data = norm_data,
                                        threshold = 0.90,
                                        file_path = recip_path,
                                        file_name = "perm_val_PTST_90_thresh.xlsx") 

print(paste0(length(selected_ft_recip_qt),
             " metabolites were selected with a 0.90 threshold in the St Louis cohort."))
## [1] "50 metabolites were selected with a 0.90 threshold in the St Louis cohort."

We can see which of these metabolites were also selected in the Cryostem cohort to separate tolerant from non tolerant recipients:

## [1] "3 features were commonly found in both cohorts"

We can see how many of these features had the same behaviour in the two cohorts (ie, were over or underexpressed in the same group of patients)

## [1] "2 features out of the 3 common selected features have the same behaviour in both cohorts."
## [1] "X2.palmitoyl.GPC..16.0.." "X3.aminoisobutyrate"

Graph of the selected metabolites that are common between the two cohorts: The features that were correlated (ie expressed in similar ways among the recipients) are linked in the graph. The features that were more expressed in the non tolerant St Louis recipients are colored in red, while the features that were more expressed in tolerant recipients are colored in blue.

I save the common features in an excel file: The excel file contains information on which features were correlated in the graph above. It also contains, for every gene, the associated logFC, pvalue…, where the comparison is in non tolerant versus tolerant patients (eg, a positive Fold Change for a gene therefore means that the gene is overexpressed in non tolrant patients)

Comparison of all metabolites of the 2 cohorts:

metabo_cor_2_cohorts_LR(model_stat1="~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo_national/recip/perm_vals_PS.RData",
                        norm_data1 = "~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo_national/recip/norm_data_PS.RData",
                        model_stat2 = "~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/recip/perm_vals_PS.RData", 
                        norm_data2 = "~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/recip/norm_data_PS.RData", 
                        stat_of_interest = 1, 
                        meta_metabo = meta_metabo)
## [1] "Number of metabolites in common between the two cohorts = 438"

Including gender compatibility and age recip in the feature selection

Now that we have selected metabolites that seemed to play a role in tolerance that were common to both cohorts (with a threhold over 0.90), we decide to investigate into these metabolites to see if some of them are related to the age of the recipients, to the gender compatibility between the donor and recipient, to both or to none of these two causes.

I first build new models using the metabolites that were kept in the two cohorts as informative when comparing primary and secondary tolerant patients:

new_models <- lapply(seq_along(common_features$common_features), function(idx){
  dta_tmp <- norm_data[,c(common_features$common_features[idx],
                        "group", "age_recip", "gender_comp")]
  colnames(dta_tmp)[1] <- "Var"
  fit1 <- glm(group ~ Var, data=dta_tmp,trace=F, family = "binomial")
  fit2 <- glm(group ~ gender_comp + Var,
              data=dta_tmp,trace=F, family = "binomial")
  fit3 <- glm(group ~ age_recip + Var, 
              data=dta_tmp,trace=F, family = "binomial")
  fit4 <- glm(group ~ gender_comp + age_recip + Var, 
              data=dta_tmp,trace=F, family = "binomial")
  list(fit1, fit2, fit3, fit4)
})
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
plots_gender_age(pdf_name = "../plots/metabo/recip/age_and_gender/prim_sec_0.9.pdf",
                 norm_data = norm_data,
                 features = common_features$common_features)
## Warning: Transformation introduced infinite values in continuous x-axis
## quartz_off_screen 
##                 2
plots_gender_age(pdf_name = "../plots/metabo/recip/age_and_gender/prim_sec_0.9_behavior.pdf",
                 norm_data = norm_data,
                 features = common_beh_features)
## quartz_off_screen 
##                 2

Analysis on the donors only :

Load in the data :

load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/donors/norm_data.RData")
load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/donors/meta_metabo.RData")
load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/donors/big_mat.RData")

We can perform a principal components analysis on the donors to see if we would already see some structure in the data:

pca_met <- prcomp(norm_data %>% dplyr::select(-"Group"))

It looks like the two first principal components are by far the most informative:

plot(pca_met)

## Warning: Removed 4 rows containing missing values (geom_point).

I plot the FCS names of the donors, for easier comparison of the metabolomics data with the results found in the CyTOF data. The groups seem to be quite mixed in the PCA. I also colored the donors per CMV status, but there doesn’t seem to be a CMV positive group, as opposed to what was observed in the CyTOF data.

We can try to run random forest models on the donors, before applying any feature selection : But it seems like the metabolomics data isn’t sufficient to classify the donors leading to tolerance or non tolerance

## 
## Call:
##  randomForest(formula = Group ~ ., data = norm_data, mtry = 40) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 40
## 
##         OOB estimate of  error rate: 44.12%
## Confusion matrix:
##                    Non_Tolerant Primary_tolerant Secondary_tolerant
## Non_Tolerant                 19                2                  0
## Primary_tolerant              7                0                  0
## Secondary_tolerant            6                0                  0
##                    class.error
## Non_Tolerant         0.0952381
## Primary_tolerant     1.0000000
## Secondary_tolerant   1.0000000

We can try an easier classification problem: non tolerant vs tolerant patients. But it seems like the metabolomics data doesn’t allow us to classify the donors.

## 
## Call:
##  randomForest(formula = group ~ ., data = df_tmp, mtry = mtry,      ntree = ntree, importance = T, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 10000
## No. of variables tried at each split: 30
## 
##         OOB estimate of  error rate: 52.94%
## Confusion matrix:
##              non_tolerant tolerant class.error
## non_tolerant           14        7   0.3333333
## tolerant               11        2   0.8461538

It seems like the metabolomics daat isn’t sufficient to classify the donors as primary or seconday tolerant neither:

## 
## Call:
##  randomForest(formula = group ~ ., data = df_tol, mtry = mtry,      ntree = ntree, importance = T, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 10000
## No. of variables tried at each split: 30
## 
##         OOB estimate of  error rate: 61.54%
## Confusion matrix:
##                    primary_tolerant secondary_tolerant class.error
## primary_tolerant                  4                  3   0.4285714
## secondary_tolerant                5                  1   0.8333333

Feature selection

Differences in metabolites between donors leading to tolerance or non tolerance

Run the permutation distribution, taking demographic variables into account: I first add the demographics info to the norm_data:

I then compute the permutation distribution for non vs tolerance.

Visualise the resulting quantile values:

load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/donors/perm_vals_TNT.RData")
load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/donors/norm_data_TNT.RData")
par(mfrow=c(2,1))
barplot(unlist(map(perm_vals, 1)))
barplot(unlist(map(perm_vals, 2)))

saveRDS(norm_data, file = paste0(donor_path, "norm_data_TNT.RDS"))
saveRDS(perm_vals, file = paste0(donor_path, "perm_vals_TNT.RDS"))

Compute the median per group for every gene, to use it later in the graphs:

gr_medians <- norm_data %>% 
  group_by(group) %>% 
  summarize_if(is.numeric, median) %>% 
  as.data.frame()
rownames(gr_medians) <- gr_medians$group

Threshold of 0.90

# I first rearrange the nor data matrix (so that I don't have to add +1 to select features)

norm_data <- norm_data %>% 
  rownames_to_column("rn") %>% 
  select(-group, everything()) %>% 
  column_to_rownames("rn")

Selected metabolites for the donors, after computing permutation distributions for every metabolite:

## [1] "112 metabolites were selected with a 0.90 threshold in the St Louis cohort."

We can see which of these metabolites were also selected in the St Louis cohort to separate tolerant from non tolerant recipients:

## [1] "12 features were commonly found in both cohorts"

We can see how many of these features had the same behaviour in the two cohorts (ie, were over or underexpressed in the same group of patients)

## [1] "6 features out of the 12 common selected features have the same behaviour in both cohorts."
## [1] "X5alpha.pregnan.3beta.20alpha.diol.disulfate"
## [2] "androstenediol..3beta.17beta..disulfate..2." 
## [3] "androsterone.sulfate"                        
## [4] "beta.alanine"                                
## [5] "lysine"                                      
## [6] "taurocholenate.sulfate"

Graph of the selected metabolites that are common between the two cohorts: The features that were correlated (ie expressed in similar ways among the recipients) are linked in the graph. The features that were more expressed in the non tolerant St Louis recipients are colored in red, while the features that were more expressed in tolerant recipients are colored in blue.

I save the common features in an excel file: The excel file contains information on which features were correlated in the graph above. It also contains, for every gene, the associated logFC, pvalue…, where the comparison is in non tolerant versus tolerant patients (eg, a positive Fold Change for a gene therefore means that the gene is overexpressed in non tolrant patients)

We can try to build a RF model based on these common features:

## 
## Call:
##  randomForest(formula = group ~ ., data = df_tmp, mtry = mtry,      ntree = ntree, importance = T, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 26.47%
## Confusion matrix:
##              non_tolerant tolerant class.error
## non_tolerant           16        5   0.2380952
## tolerant                4        9   0.3076923

Including gender compatibility and age recip in the feature selection

Now that we have selected metabolites that seemed to play a role in tolerance that were common to both cohorts (with a threhold over 0.90), we decide to investigate into these metabolites to see if some of them are related to the age of the recipients, to the gender compatibility between the donor and recipient, to both or to none of these two causes.

I first build new models using the metabolites that were kept in the two cohorts as informative when comparing tolerant and non tolerant patients:

new_models <- lapply(seq_along(common_features$common_features), function(idx){
  dta_tmp <- norm_data[,c(common_features$common_features[idx],
                        "group", "age_recip", "gender_comp")]
  colnames(dta_tmp)[1] <- "Var"
  fit1 <- glm(group ~ Var, data=dta_tmp,trace=F, family = "binomial")
  fit2 <- glm(group ~ gender_comp + Var,
              data=dta_tmp,trace=F, family = "binomial")
  fit3 <- glm(group ~ age_recip + Var, 
              data=dta_tmp,trace=F, family = "binomial")
  fit4 <- glm(group ~ gender_comp + age_recip + Var, 
              data=dta_tmp,trace=F, family = "binomial")
  list(fit1, fit2, fit3, fit4)
})
plots_gender_age(pdf_name = "../plots/metabo/donors/age_and_gender/tol_nontol_0.9.pdf",
                 norm_data = norm_data,
                 features = common_features$common_features)
## quartz_off_screen 
##                 2
plots_gender_age(pdf_name = "../plots/metabo/donors/age_and_gender/tol_nontol_0.9_behavior.pdf",
                 norm_data = norm_data,
                 features = common_beh_features)
## quartz_off_screen 
##                 2

Looking at differences between primary and secondary tolerant donors

I compute the permutation distribution for primary vs secondary tolerant donors:

Visualise the resulting quantile values: In the first plot: quantile values of the models on metabolites one by one In the second plot: quantile values of the difference between group~ age+gender and group~ age+gender+metabolite

load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/donors/norm_data_PS.RData")
load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/donors/perm_vals_PS.RData")
par(mfrow=c(2,1))
barplot(unlist(map(perm_vals, 1)))
barplot(unlist(map(perm_vals, 2)))

saveRDS(norm_data, file = paste0(donor_path, "norm_data_PTST.RDS"))
saveRDS(perm_vals, file = paste0(donor_path, "perm_vals_PTST.RDS"))

Compute the median per group for every gene, to use it later in the graphs:

gr_medians <- norm_data %>% 
  group_by(group) %>% 
  summarize_if(is.numeric, median) %>% 
  as.data.frame()
rownames(gr_medians) <- gr_medians$group

Threshold of 0.90

# I first rearrange the nor data matrix (so that I don't have to add +1 to select features)

norm_data <- norm_data %>% 
  rownames_to_column("rn") %>% 
  select(-group, everything()) %>% 
  column_to_rownames("rn")

Selected metabolites for the donors, after computing permutation distributions for every metabolite:

## [1] "45 metabolites were selected with a 0.90 threshold in the St Louis cohort."

We can see which of these metabolites were also selected in the Cryostem cohort to separate primary from secondary tolerant donors:

## [1] "3 features were commonly found in both cohorts"

We can see how many of these features had the same behaviour in the two cohorts (ie, were over or underexpressed in the same group of patients)

## [1] "1 features out of the 3 common selected features have the same behaviour in both cohorts."
## [1] "palmitoyl.dihydrosphingomyelin..d18.0.16.0.."

Graph of the selected metabolites that are common between the two cohorts: The features that were correlated (ie expressed in similar ways among the recipients) are linked in the graph. The features that were more expressed in the non tolerant St Louis recipients are colored in red, while the features that were more expressed in tolerant recipients are colored in blue.

I save the common features in an excel file: The excel file contains information on which features were correlated in the graph above. It also contains, for every gene, the associated logFC, pvalue…, where the comparison is in non tolerant versus tolerant patients (eg, a positive Fold Change for a gene therefore means that the gene is overexpressed in non tolrant patients)

Including gender compatibility and age recip in the feature selection

Now that we have selected metabolites that seemed to play a role in tolerance that were common to both cohorts (with a threhold over 0.90), we decide to investigate into these metabolites to see if some of them are related to the age of the recipients, to the gender compatibility between the donor and recipient, to both or to none of these two causes.

I first build new models using the metabolites that were kept in the two cohorts as informative when comparing tolerant and non tolerant patients:

new_models <- lapply(seq_along(common_features$common_features), function(idx){
  dta_tmp <- norm_data[,c(common_features$common_features[idx],
                        "group", "age_recip", "gender_comp")]
  colnames(dta_tmp)[1] <- "Var"
  fit1 <- glm(group ~ Var, data=dta_tmp,trace=F, family = "binomial")
  fit2 <- glm(group ~ gender_comp + Var,
              data=dta_tmp,trace=F, family = "binomial")
  fit3 <- glm(group ~ age_recip + Var, 
              data=dta_tmp,trace=F, family = "binomial")
  fit4 <- glm(group ~ gender_comp + age_recip + Var, 
              data=dta_tmp,trace=F, family = "binomial")
  list(fit1, fit2, fit3, fit4)
})
plots_gender_age(pdf_name = "../plots/metabo/donors/age_and_gender/prim_sec_0.9.pdf",
                 norm_data = norm_data,
                 features = common_features$common_features)
## quartz_off_screen 
##                 2
plots_gender_age(pdf_name = "../plots/metabo/donors/age_and_gender/prim_sec_0.9_behavior.pdf",
                 norm_data = norm_data,
                 features = common_beh_features)
## quartz_off_screen 
##                 2

Analysis on the donors - recipients :

In order to investigate into the differences between the recipients’ metabolite profile compared to the original profile of their donor, I’m running the same analysis on the donors-recipients (matched per couple of course)

We can apply a principal components analysis to see if there is some structure in the data that can already be seen:

## Adding missing grouping variables: `COUPLENUMBER`

In the following PCA plot, we plot the recipients as black dots, the donors as white dots, and we link donors and recipients that belong to same a couple by a line of the color of their group.

It looks like, in the metabolomics data of the St Louis cohort, the distances between non tolerant donors and recipients are higher than the distances between primary or secondary tolerant donors and recipients. This is similar to what we had observed in the CyTOF data of these same patients.

list_couples <- names(table(big_mat2$COUPLENUMBER))
names(distances) <- paste0("couple_",list_couples)
couple_dist_matrix <- big_mat2[,c("COUPLENUMBER", "Group")] %>% arrange(COUPLENUMBER) %>% unique()
couple_dist_matrix$Group <- as.factor(couple_dist_matrix$Group)
dis_colors <- c("red","green","blue")[couple_dist_matrix$Group]
order_dis <- order(distances)

plot(distances[order_dis], col = dis_colors[order_dis], pch=19,
     main = "Distance between donor and recipient from same couple",
     ylab = "Distance D/R", xaxt = "n", xlab = "")
axis(1, at=seq_along(list_couples), labels=FALSE)
text(x=seq_along(list_couples), y=par()$usr[3]-0.1*(par()$usr[4]-par()$usr[3]),
     labels=names(distances[order_dis]), srt=45, adj=1, xpd=TRUE, cex = .7)
legend("topleft", c("non_tolerant","primary_tolerant","secondary_tolerant"),
       col = c("red","green","blue"), pch = 19)

I can already try to classify the donors - recipients based on all the metabolites:

subst$group <- as.factor(subst$group)
colnames(subst) <- make.names(colnames(subst))

set.seed(1)
rf_subst <- randomForest::randomForest(group~., subst, mtry = 40)
rf_subst
## 
## Call:
##  randomForest(formula = group ~ ., data = subst, mtry = 40) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 40
## 
##         OOB estimate of  error rate: 38.24%
## Confusion matrix:
##                    Non_Tolerant Primary_tolerant Secondary_tolerant
## Non_Tolerant                 21                0                  0
## Primary_tolerant              6                0                  1
## Secondary_tolerant            4                2                  0
##                    class.error
## Non_Tolerant                 0
## Primary_tolerant             1
## Secondary_tolerant           1

Trying to classify between non tolerant and tolerant patients:

## 
## Call:
##  randomForest(formula = group ~ ., data = df_tmp, mtry = mtry,      ntree = ntree, importance = T, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 10000
## No. of variables tried at each split: 30
## 
##         OOB estimate of  error rate: 20.59%
## Confusion matrix:
##              non_tolerant tolerant class.error
## non_tolerant           19        2   0.0952381
## tolerant                5        8   0.3846154

It seems like some metabolites might help to differentiate between tolerant and non tolerant couples. Trying to classify only primary and seconday tolerant recipients:

## 
## Call:
##  randomForest(formula = group ~ ., data = df_tol, mtry = mtry,      ntree = ntree, importance = T, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 10000
## No. of variables tried at each split: 30
## 
##         OOB estimate of  error rate: 69.23%
## Confusion matrix:
##                    primary_tolerant secondary_tolerant class.error
## primary_tolerant                  3                  4   0.5714286
## secondary_tolerant                5                  1   0.8333333

But the information in the metabolites seems unsufficient to classify primary and secondary couples.

Feature selection

Investigating into metabolites that mainly differ between tolerant and non tolerant couples:

I first add the demographics info to the couple matrix:

I then compute the permutation distribution for non vs tolerant recipients.

Visualise the resulting quantile values:

load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/r&d/perm_vals_TNT.RData")
load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/r&d/norm_data_TNT.RData")
par(mfrow=c(2,1))
barplot(unlist(map(perm_vals, 1)))
barplot(unlist(map(perm_vals, 2)))

Compute the median per group for every gene, to use it later in the graphs:

gr_medians <- norm_data %>%
  group_by(group) %>%
  summarize_if(is.numeric, median) %>%
  as.data.frame()
rownames(gr_medians) <- gr_medians$group

Threshold of 0.95

Selected metabolites for the couples, with a threshold on 0.95:

qt <- unlist(map(perm_vals, 1))
densityplot(qt)

selected_metabolites_rd_qt <- colnames(norm_data)[which(qt>.95)+1]
write.xlsx(selected_metabolites_rd_qt, file = "../data/metabo/r&d/perm_val_TNT_95_thresh.xlsx")
# selected_metabolites_rd_qt
print(paste0(length(selected_metabolites_rd_qt),
             " metabolites were selected with a 0.95 threshold in the St Louis cohort."))
## [1] "90 metabolites were selected with a 0.95 threshold in the St Louis cohort."

Which of these metabolites were common in the cryostem cohort?

sel_cryo <- read.xlsx("../data/metabo_national/r&d/perm_val_TNT_95_thresh.xlsx", colNames = FALSE)
sel_common <- which(selected_metabolites_rd_qt %in% sel_cryo$X1)
length(sel_common)
## [1] 17
sel_com_names <- selected_metabolites_rd_qt[sel_common]
sel_com_pretty_names <- gsub("subst_", "", sel_com_names)
sel_com_pretty_names 
##  [1] "X1.5.anhydroglucitol..1.5.AG."                     
##  [2] "X1.palmitoleoylglycerol..16.1.."                   
##  [3] "X1.palmitoyl.2.oleoyl.GPE..16.0.18.1."             
##  [4] "X3.methyl.2.oxovalerate"                           
##  [5] "X4.methyl.2.oxopentanoate"                         
##  [6] "X5alpha.pregnan.3beta.20alpha.diol.monosulfate..2."
##  [7] "aspartate"                                         
##  [8] "cysteine.glutathione.disulfide"                    
##  [9] "isoleucine"                                        
## [10] "leucine"                                           
## [11] "N.palmitoyl.sphinganine..d18.0.16.0."              
## [12] "stearoyl.arachidonoyl.glycerol..18.0.20.4...1.."   
## [13] "taurochenodeoxycholate"                            
## [14] "taurocholate"                                      
## [15] "tryptophan"                                        
## [16] "uridine"                                           
## [17] "ximenoylcarnitine..C26.1.."

I save these common metabolites:

metabo_ft_sel_dr_TNT_95 <- norm_data[,c("group", selected_metabolites_rd_qt[sel_common])]
save(metabo_ft_sel_dr_TNT_95, 
     file = "../data/metabo/r&d/metabo_ft_sel_dr_TNT_95.RData")
write.xlsx(sel_com_pretty_names, file = "../data/metabo/r&d/excel_files_selected/metabo_ft_sel_dr_TNT_95.xlsx")

We can try to build a RF model based on the common features:

## 
## Call:
##  randomForest(formula = group ~ ., data = df_tmp, mtry = mtry,      ntree = ntree, importance = T, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 10000
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 20.59%
## Confusion matrix:
##              non_tolerant tolerant class.error
## non_tolerant           18        3   0.1428571
## tolerant                4        9   0.3076923

Threshold of 0.90

Selected metabolites for the couples, with a threshold on 0.95:

qt <- unlist(map(perm_vals, 1))
densityplot(qt)

selected_metabolites_rd_qt <- colnames(norm_data)[which(qt>.90)+1]
write.xlsx(selected_metabolites_rd_qt, file = "../data/metabo/r&d/perm_val_TNT_90_thresh.xlsx")
# selected_metabolites_rd_qt
print(paste0(length(selected_metabolites_rd_qt),
             " metabolites were selected with a 0.95 threshold in the St Louis cohort."))
## [1] "120 metabolites were selected with a 0.95 threshold in the St Louis cohort."

Which of these metabolites were common in the cryostem cohort?

sel_cryo <- read.xlsx("../data/metabo_national/r&d/perm_val_TNT_90_thresh.xlsx", colNames = FALSE)
sel_common <- which(selected_metabolites_rd_qt %in% sel_cryo$X1)
length(sel_common)
## [1] 32
sel_com_names <- selected_metabolites_rd_qt[sel_common]
sel_com_names
##  [1] "subst_X1.2.dilinoleoyl.GPC..18.2.18.2."                  
##  [2] "subst_X1.5.anhydroglucitol..1.5.AG."                     
##  [3] "subst_X1..1.enyl.palmitoyl..2.oleoyl.GPC..P.16.0.18.1.." 
##  [4] "subst_X1.palmitoleoylglycerol..16.1.."                   
##  [5] "subst_X1.palmitoyl.2.oleoyl.GPE..16.0.18.1."             
##  [6] "subst_X1.stearoyl.2.linoleoyl.GPE..18.0.18.2.."          
##  [7] "subst_X2.aminobutyrate"                                  
##  [8] "subst_X2.linoleoylglycerol..18.2."                       
##  [9] "subst_X3.methyl.2.oxovalerate"                           
## [10] "subst_X4.methyl.2.oxopentanoate"                         
## [11] "subst_X5alpha.pregnan.3beta.20alpha.diol.monosulfate..2."
## [12] "subst_aspartate"                                         
## [13] "subst_carnitine"                                         
## [14] "subst_cholesterol"                                       
## [15] "subst_cysteine.glutathione.disulfide"                    
## [16] "subst_EDTA"                                              
## [17] "subst_ergothioneine"                                     
## [18] "subst_glucuronate"                                       
## [19] "subst_glycochenodeoxycholate"                            
## [20] "subst_glycocholate"                                      
## [21] "subst_isoleucine"                                        
## [22] "subst_leucine"                                           
## [23] "subst_N.palmitoyl.sphinganine..d18.0.16.0."              
## [24] "subst_nicotinamide"                                      
## [25] "subst_ornithine"                                         
## [26] "subst_proline"                                           
## [27] "subst_stearoyl.arachidonoyl.glycerol..18.0.20.4...1.."   
## [28] "subst_taurochenodeoxycholate"                            
## [29] "subst_taurocholate"                                      
## [30] "subst_tryptophan"                                        
## [31] "subst_uridine"                                           
## [32] "subst_ximenoylcarnitine..C26.1.."

Save these common features:

metabo_ft_sel_rd_TNT_90 <- norm_data[, c("group", sel_com_names)]
save(metabo_ft_sel_rd_TNT_90,
     file = "../data/metabo/r&d/metabo_ft_sel_rd_TNT_90.RData")
write.xlsx(sel_com_names, file = "../data/metabo/r&d/excel_files_selected/metabo_ft_sel_donor_TNT_90.xlsx")

Plot the graph of these selected metabolites colored based on the groups where the metabolites are most expressed:

correlations <- norm_data[,sel_com_names] %>%
  as.matrix %>%
  cor %>%
  as.data.frame %>%
  rownames_to_column(var = 'var1') %>%
  gather(var2, value, -var1) %>%
  dplyr::filter(var1 < var2)

correlated_features <- correlations %>%
  dplyr::filter(value > 0.7)
gr <- graph_from_data_frame(correlated_features, directed = FALSE, vertices=sel_com_names)
nodes <- get.vertex.attribute(gr)
color_nodes <- rep("red", length(nodes$name))

medians <- gr_medians[,nodes$name]
for (gene in seq_along(colnames(medians))){
  if (medians["tolerant", gene] == max(medians[, gene])){
    color_nodes[gene] <- "blue"
  }
}

set.seed(1)

plot(gr, vertex.size = 7, vertex.label.cex = .5, edge.width = (E(gr)$value - .7) * 20,
     vertex.color = color_nodes)

Now that we have identified features that are informative in both cohorts to classify tolerant from non tolerant couples, we can try to use them to build a model to classify the couples of the St Louis cohort:

colnames(norm_data) <- make.names(colnames(norm_data))
set.seed(1)
rf1 <- rf_tol_non_tol(df_orig = norm_data[,c("group", make.names(sel_com_names))], ntree = 1000, package_2use = "ranger")
rf1$confusion.matrix
##               predicted
## true           non_tolerant tolerant
##   non_tolerant           19        2
##   tolerant                3       10
paste0("prediction error: ", rf1$prediction.error*100)
## [1] "prediction error: 14.7058823529412"

We can try to build a RF model based on the common features that had a similar behaviour between the two cohorts:

##               predicted
## true           non_tolerant tolerant
##   non_tolerant           16        5
##   tolerant                7        6
## [1] "prediction error: 35.2941176470588"

But this doesn’t seem to improve the classification

Including age and gender in the analysis

Now that we have selected features that seemed to play a role in tolerance that were common to both cohorts (with a threhold over 0.90), we decide to investigate into these features to see if some of them are related to the age of the recipients, to the gender compatibility between the donor and recipient, to both or to none of these two causes.

I first build new models using the features that were kept in the two cohorts as informative when comparing tolerant and non tolerant patients:

new_models <- lapply(seq_along(make.names(sel_com_names)), function(idx){
  dta_tmp <- norm_data[,c(make.names(sel_com_names)[idx],
                        "group", "age_recip", "gender_comp")]
  colnames(dta_tmp)[1] <- "Var"
  fit1 <- glm(group ~ Var, data=dta_tmp,trace=F, family = "binomial")
  fit2 <- glm(group ~ gender_comp + Var,
              data=dta_tmp,trace=F, family = "binomial")
  fit3 <- glm(group ~ age_recip + Var, 
              data=dta_tmp,trace=F, family = "binomial")
  fit4 <- glm(group ~ gender_comp + age_recip + Var, 
              data=dta_tmp,trace=F, family = "binomial")
  list(fit1, fit2, fit3, fit4)
})
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
plots_gender_age(pdf_name = "../plots/metabo/rd/age_and_gender/tol_nontol_0.9.pdf",
                 norm_data = norm_data,
                 features = make.names(sel_com_names))
## quartz_off_screen 
##                 2

Looking at differences between primary and secondary tolerant couples

I compute the permutation distribution for primary vs secondary tolerant donors:

Visualise the resulting quantile values: In the first plot: quantile values of the models on metabolites one by one In the second plot: quantile values of the difference between group~ age+gender and group~ age+gender+metabolite

load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/r&d/norm_data_PS.RData")
load("~/Documents/VIB/Projects/Integrative_Paris/Integrative/outputs/data/metabo/r&d/perm_vals_PS.RData")
par(mfrow=c(2,1))
barplot(unlist(map(perm_vals, 1)))
barplot(unlist(map(perm_vals, 2)))

Compute the median per group for every gene, to use it later in the graphs:

gr_medians <- norm_data %>%
  group_by(group) %>%
  summarize_if(is.numeric, median) %>%
  as.data.frame()
rownames(gr_medians) <- gr_medians$group

Threshold of 0.95

Selected metabolites for the couples, with a threshold on 0.95:

qt <- unlist(map(perm_vals, 1))
densityplot(qt)

selected_metabolites_rd_qt <- colnames(norm_data)[which(qt>.95)+1]
write.xlsx(selected_metabolites_rd_qt, file = "../data/metabo/r&d/perm_val_PS_95_thresh.xlsx")
# selected_metabolites_rd_qt
print(paste0(length(selected_metabolites_rd_qt),
             " metabolites were selected with a 0.95 threshold in the St Louis cohort."))
## [1] "41 metabolites were selected with a 0.95 threshold in the St Louis cohort."

Which of these metabolites were common in the cryostem cohort?

sel_cryo <- read.xlsx("../data/metabo_national/r&d/perm_val_PS_95_thresh.xlsx", colNames = FALSE)
sel_common <- which(selected_metabolites_rd_qt %in% sel_cryo$X1)
length(sel_common)
## [1] 1
sel_com_names <- selected_metabolites_rd_qt[sel_common]
sel_com_pretty_names <- gsub("subst_", "", sel_com_names)
sel_com_pretty_names 
## [1] "X2.palmitoyl.GPC..16.0.."
write.xlsx(sel_com_pretty_names, file = "../data/metabo/r&d/excel_files_selected/metabo_ft_sel_dr_PS_95.xlsx")

Threshold of 0.90

Selected metabolites for the couples, with a threshold on 0.90:

qt <- unlist(map(perm_vals, 1))
densityplot(qt)

selected_metabolites_rd_qt <- colnames(norm_data)[which(qt>.90)+1]
write.xlsx(selected_metabolites_rd_qt, file = "../data/metabo/r&d/perm_val_PS_90_thresh.xlsx")
print(paste0(length(selected_metabolites_rd_qt),
             " metabolites were selected with a 0.90 threshold in the St Louis cohort."))
## [1] "77 metabolites were selected with a 0.90 threshold in the St Louis cohort."

Which of these metabolites were common in the cryostem cohort?

sel_cryo <- read.xlsx("../data/metabo_national/r&d/perm_val_PS_90_thresh.xlsx", colNames = FALSE)
sel_common <- which(selected_metabolites_rd_qt %in% sel_cryo$X1)
length(sel_common)
## [1] 8
sel_com_names <- selected_metabolites_rd_qt[sel_common]
sel_com_pretty_names <- gsub("subst_", "", sel_com_names)
sel_com_pretty_names 
## [1] "X1.oleoyl.2.docosahexaenoyl.GPC..18.1.22.6.."
## [2] "X1.palmitoleoyl.2.linoleoyl.GPC..16.1.18.2.."
## [3] "X2.palmitoyl.GPC..16.0.."                    
## [4] "dimethylarginine..SDMA...ADMA."              
## [5] "N1.Methyl.4.pyridone.3.carboxamide"          
## [6] "sphingosine"                                 
## [7] "sphingosine.1.phosphate"                     
## [8] "succinate"

Save these common features:

metabo_ft_sel_rd_PS_90 <- norm_data[, c("group", sel_com_names)]
save(metabo_ft_sel_rd_PS_90,
     file = "../data/metabo/r&d/metabo_ft_sel_rd_PS_90.RData")
write.xlsx(sel_com_pretty_names, file = "../data/metabo/r&d/excel_files_selected/metabo_ft_sel_dr_PS_90.xlsx")

Random forest on the metabolites that are common between the donors of the 2 cohorts: Selecting features has slightly improved the classification results:

set.seed(1)
rf_d <- ranger::ranger(group~., norm_data[,c("group", selected_metabolites_rd_qt[sel_common])], num.trees = 1000,
importance = "impurity")
rf_d$confusion.matrix
##                     predicted
## true                 primary_tolerant secondary_tolerant
##   primary_tolerant                  4                  3
##   secondary_tolerant                2                  4
paste0("prediction error: ", rf_d$prediction.error*100)
## [1] "prediction error: 38.4615384615385"

Including age and gender in the analysis

Now that we have selected features that seemed to play a role in tolerance that were common to both cohorts (with a threhold over 0.90), we decide to investigate into these features to see if some of them are related to the age of the recipients, to the gender compatibility between the donor and recipient, to both or to none of these two causes.

I first build new models using the features that were kept in the two cohorts as informative when comparing tolerant and non tolerant patients:

new_models <- lapply(seq_along(selected_metabolites_rd_qt[sel_common]), function(idx){
  dta_tmp <- norm_data[,c(selected_metabolites_rd_qt[sel_common][idx],
                        "group", "age_recip", "gender_comp")]
  colnames(dta_tmp)[1] <- "Var"
  fit1 <- glm(group ~ Var, data=dta_tmp,trace=F, family = "binomial")
  fit2 <- glm(group ~ gender_comp + Var,
              data=dta_tmp,trace=F, family = "binomial")
  fit3 <- glm(group ~ age_recip + Var, 
              data=dta_tmp,trace=F, family = "binomial")
  fit4 <- glm(group ~ gender_comp + age_recip + Var, 
              data=dta_tmp,trace=F, family = "binomial")
  list(fit1, fit2, fit3, fit4)
})
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
plots_gender_age(pdf_name = "../plots/metabo/rd/age_and_gender/tol_nontol_0.9.pdf",
                 norm_data = norm_data,
                 features = selected_metabolites_rd_qt[sel_common])
## Warning: Transformation introduced infinite values in continuous x-axis
## quartz_off_screen 
##                 2

Combining the information on couples, recipients and donors:

Now that we have selected features at the recipient level, at the donor level and at the couple level, we can try to combine these features:

Classifying non versus tolerant patients based on all the information:

We first combine the features that were selected as highly informative to classify between tolerant and non tolerant patients from the couples, recipients and donors into one big table:

We can generate a PCA and a tSNE using all these features to see how the tolerant and non tolerant patients are represented in the reduced dimensions:

The 2 first components of the PCA seem to hold quite a lot of the variability in the data:

plot(pca_all)

set.seed(1)
pca_2plot <- as.data.frame(pca_all$x) %>% 
  mutate(COUPLENUMBER = as.numeric(rownames(pca_all$x))) %>% 
  left_join(pctgs_all, by = "COUPLENUMBER")
samp_tmp <- samp_rd %>% filter(!is.na(METABONAME)) %>% 
  column_to_rownames("METABONAME")
samp_tmp <- samp_tmp[pca_2plot$METABONAME.x,]

pca_2plot<- pca_2plot %>% mutate(Id.Cryostem = samp_tmp$Id.Cryostem.R)

ggplot(pca_2plot, aes(x = PC1, y = PC2, col = as.factor(group), label = Id.Cryostem)) + 
  geom_point() +
  geom_text(aes(label=Id.Cryostem),hjust=0, vjust=0)

The non tolerant patients seem to be situated more on the left of the PCA, and the tolerant patients seem to be situated more to the right, which suggests that we might have extracted some information allowing us to classify the St Louis patients based on the features that we selected.

We can also generate a tSNE on the same data:

set.seed(1)
tsne_all <- Rtsne::Rtsne(pctgs_all_only, perplexity = 10)
tsne_2plot <- as.data.frame(tsne_all$Y) %>% 
  magrittr::set_colnames(c("X1", "X2")) %>% 
  mutate(COUPLENUMBER = as.numeric(rownames(pctgs_all_only))) %>% 
  left_join(pctgs_all, by = "COUPLENUMBER")

tsne_2plot<- tsne_2plot %>% mutate(Id.Cryostem = samp_tmp$Id.Cryostem.R)

ggplot(tsne_2plot, aes(x = X1, y = X2, col = as.factor(group), 
                       label = Id.Cryostem)) + 
  geom_point() +
  geom_text(aes(label=Id.Cryostem),hjust=0, vjust=0)

We can see if building a model using all these features combined leads to good classification results between tolerant and non tolerant patients:

pctgs_all_RF <- pctgs_all %>% 
  select(-"COUPLENUMBER", -"gender_comp", -"age_recip") %>% 
  mutate(group = as.factor(group)) %>% 
  column_to_rownames("METABONAME.x")

set.seed(1)
nb_features = ncol(pctgs_all_RF)-1

rf <- rf_tol_non_tol(df_orig = pctgs_all_RF,
                      mtry = round(sqrt(nb_features)), ntree = 1000, package_2use = "ranger")
rf$confusion.matrix
##               predicted
## true           non_tolerant tolerant
##   non_tolerant           20        1
##   tolerant                0       13
paste0("prediction error: ", rf$prediction.error*100)
## [1] "prediction error: 2.94117647058824"

The random forest model seems to have benefitted a lot of the feature selection, only one patient is misclassified. We can see which features were selected in the model to classify tolerant from non tolerant patients in the St Louis cohort:

##                                                 R_leucine 
##                                                 1.7634626 
##                                              R_isoleucine 
##                                                 1.7419745 
##              R_X1.stearoyl.2.arachidonoyl.GPC..18.0.20.4. 
##                                                 1.2177480 
##                 R_dehydroisoandrosterone.sulfate..DHEA.S. 
##                                                 0.8998627 
##           R_androstenediol..3beta.17beta..monosulfate..1. 
##                                                 0.6783015 
##                                    R_androsterone.sulfate 
##                                                 0.5353109 
##                                                  R_valine 
##                                                 0.5032239 
##                                              R_tryptophan 
##                                                 0.4883872 
##  subst_X5alpha.pregnan.3beta.20alpha.diol.monosulfate..2. 
##                                                 0.4873513 
##                               R_X4.methyl.2.oxopentanoate 
##                                                 0.4816374 
##                                 R_X3.methyl.2.oxovalerate 
##                                                 0.4444191 
##                                                 R_proline 
##                                                 0.3503888 
##                                                  R_lysine 
##                                                 0.3291789 
##                                         R_hydroxycotinine 
##                                                 0.2670781 
## R_X1..1.enyl.palmitoyl..2.arachidonoyl.GPC..P.16.0.20.4.. 
##                                                 0.2616487 
##                subst_N.palmitoyl.sphinganine..d18.0.16.0. 
##                                                 0.2469721 
##                                 R_epiandrosterone.sulfate 
##                                                 0.2450857 
##                                               R_histidine 
##                                                 0.2430979 
##                                                 R_alanine 
##                                                 0.1740174 
##                                            R_beta.alanine 
##                                                 0.1685581

And visualise the top variables in a heatmap:

mat2plot <- pctgs_all_RF[,c("group", names(sort(rf$variable.importance, decreasing = TRUE))[1:20])] %>% 
  rownames_to_column("rownames") %>% 
  arrange(group) %>% 
  column_to_rownames("rownames")

annot_patients <- as.data.frame(mat2plot$group)
rownames(annot_patients) <- rownames(mat2plot)
colnames(annot_patients) <- "group"
ann_colors <- list(group = c(non_tolerant = "indianred1", 
                             tolerant = "royalblue1"))

pheatmap::pheatmap(mat2plot[,-which(colnames(mat2plot)=="group")], 
                   annotation_row = annot_patients,
                   annotation_colors = ann_colors[1],
                   cluster_rows = FALSE,
                   cluster_cols = FALSE,
                   main = "Heatmap with patients ordered by group",
                   scale = "column")

pheatmap::pheatmap(mat2plot[,-which(colnames(mat2plot)=="group")], 
                   annotation_row = annot_patients,
                   annotation_colors = ann_colors[1],
                   cluster_rows = TRUE,
                   main = "Heatmap with patients clustered",
                   scale = "column")

Reminder: high values for the variables that start with “subst_” mean that the difference of expression between the donor and the recipient of the same couple was high for this specific feature.

Classifying non versus tolerant patients based only on the metabolites that were common to both cohorts AND behaved similarily:

We first combine the features that were selected as highly informative to classify between tolerant and non tolerant patients from the couples, recipients and donors into one big table:

We can generate a PCA and a tSNE using all these features to see how the tolerant and non tolerant patients are represented in the reduced dimensions:

The 2 first components of the PCA seem to hold quite a lot of the variability in the data:

plot(pca_all)

set.seed(1)
pca_2plot <- as.data.frame(pca_all$x) %>% 
  mutate(COUPLENUMBER = as.numeric(rownames(pca_all$x))) %>% 
  left_join(pctgs_all, by = "COUPLENUMBER")
samp_tmp <- samp_rd %>% filter(!is.na(METABONAME)) %>% 
  column_to_rownames("METABONAME")
samp_tmp <- samp_tmp[pca_2plot$METABONAME.x,]

pca_2plot<- pca_2plot %>% mutate(Id.Cryostem = samp_tmp$Id.Cryostem.R)

ggplot(pca_2plot, aes(x = PC1, y = PC2, col = as.factor(group), label = Id.Cryostem)) + 
  geom_point() +
  geom_text(aes(label=Id.Cryostem),hjust=0, vjust=0)

The non tolerant patients seem to be situated more on the left of the PCA, and the tolerant patients seem to be situated more to the right, which suggests that we might have extracted some information allowing us to classify the St Louis patients based on the features that we selected.

We can also generate a tSNE on the same data:

set.seed(1)
tsne_all <- Rtsne::Rtsne(pctgs_all_only, perplexity = 10)
tsne_2plot <- as.data.frame(tsne_all$Y) %>% 
  magrittr::set_colnames(c("X1", "X2")) %>% 
  mutate(COUPLENUMBER = as.numeric(rownames(pctgs_all_only))) %>% 
  left_join(pctgs_all, by = "COUPLENUMBER")

tsne_2plot<- tsne_2plot %>% mutate(Id.Cryostem = samp_tmp$Id.Cryostem.R)

ggplot(tsne_2plot, aes(x = X1, y = X2, col = as.factor(group), 
                       label = Id.Cryostem)) + 
  geom_point() +
  geom_text(aes(label=Id.Cryostem),hjust=0, vjust=0)

We can see if building a model using all these features combined leads to good classification results between tolerant and non tolerant patients:

pctgs_all_RF <- pctgs_all %>% 
  select(-"COUPLENUMBER", -"gender_comp", -"age_recip") %>% 
  mutate(group = as.factor(group)) %>% 
  column_to_rownames("METABONAME.x")

set.seed(1)
nb_features = ncol(pctgs_all_RF)-1

rf <- rf_tol_non_tol(df_orig = pctgs_all_RF,
                      mtry = round(sqrt(nb_features)), ntree = 1000, package_2use = "ranger")
rf$confusion.matrix
##               predicted
## true           non_tolerant tolerant
##   non_tolerant           20        1
##   tolerant                1       12
paste0("prediction error: ", rf$prediction.error*100)
## [1] "prediction error: 5.88235294117647"

The random forest model seems to have benefitted a lot of the feature selection, only one patient is misclassified. We can see which features were selected in the model to classify tolerant from non tolerant patients in the St Louis cohort:

##                                              R_isoleucine 
##                                                 2.0863200 
##                                                 R_leucine 
##                                                 1.9499857 
##              R_X1.stearoyl.2.arachidonoyl.GPC..18.0.20.4. 
##                                                 1.3897451 
##                 R_dehydroisoandrosterone.sulfate..DHEA.S. 
##                                                 1.2557183 
##           R_androstenediol..3beta.17beta..monosulfate..1. 
##                                                 0.9820318 
##                               R_X4.methyl.2.oxopentanoate 
##                                                 0.6674960 
##                                    R_androsterone.sulfate 
##                                                 0.6175815 
##                                                  R_valine 
##                                                 0.6096004 
##                                                 R_proline 
##                                                 0.5888949 
##                                              R_tryptophan 
##                                                 0.4845056 
##                                 R_epiandrosterone.sulfate 
##                                                 0.4478871 
##                                 R_X3.methyl.2.oxovalerate 
##                                                 0.4227094 
##                                                  R_lysine 
##                                                 0.3777751 
##                                         R_hydroxycotinine 
##                                                 0.2753636 
##                                               R_histidine 
##                                                 0.2732183 
##                                            R_beta.alanine 
##                                                 0.2592328 
## R_X1..1.enyl.palmitoyl..2.arachidonoyl.GPC..P.16.0.20.4.. 
##                                                 0.2174211 
##                                              R_asparagine 
##                                                 0.2171361 
##                                                 R_alanine 
##                                                 0.2047629 
##                                                R_tyrosine 
##                                                 0.1733827

And visualise the top variables in a heatmap:

mat2plot <- pctgs_all_RF[,c("group", names(sort(rf$variable.importance, decreasing = TRUE))[1:20])] %>% 
  rownames_to_column("rownames") %>% 
  arrange(group) %>% 
  column_to_rownames("rownames")

annot_patients <- as.data.frame(mat2plot$group)
rownames(annot_patients) <- rownames(mat2plot)
colnames(annot_patients) <- "group"
ann_colors <- list(group = c(non_tolerant = "indianred1", 
                             tolerant = "royalblue1"))

pheatmap::pheatmap(mat2plot[,-which(colnames(mat2plot)=="group")], 
                   annotation_row = annot_patients,
                   annotation_colors = ann_colors[1],
                   cluster_rows = FALSE,
                   cluster_cols = FALSE,
                   main = "Heatmap with patients ordered by group",
                   scale = "column")

pheatmap::pheatmap(mat2plot[,-which(colnames(mat2plot)=="group")], 
                   annotation_row = annot_patients,
                   annotation_colors = ann_colors[1],
                   cluster_rows = TRUE,
                   main = "Heatmap with patients clustered",
                   scale = "column")

Reminder: high values for the variables that start with “subst_” mean that the difference of expression between the donor and the recipient of the same couple was high for this specific feature.

Classifying primary versus secondary tolerant patients based on all the information:

We first combine the features that were selected as highly informative to classify between primary and secondary tolerant patients from the couples, recipients and donors into one big table:

We can generate a PCA and a tSNE using all these features to see how the tolerant and non tolerant patients are represented in the reduced dimensions:

pctgs_all_only <- pctgs_all %>%
  column_to_rownames("COUPLENUMBER") %>% 
  select_if(is.numeric) %>% 
  select(-"age_recip")

pca_all <- prcomp(pctgs_all_only)

The first component of the PCA seems to hold most of the variability in the data by far:

plot(pca_all)

set.seed(1)
pca_2plot <- as.data.frame(pca_all$x) %>% 
  mutate(COUPLENUMBER = as.numeric(rownames(pca_all$x))) %>% 
  left_join(pctgs_all, by = "COUPLENUMBER")

samp_tmp <- samp_rd %>% filter(!is.na(METABONAME)) %>% 
  column_to_rownames("METABONAME")
samp_tmp <- samp_tmp[pca_2plot$METABONAME.x,]
pca_2plot<- pca_2plot %>% mutate(Id.Cryostem = samp_tmp$Id.Cryostem.R)

ggplot(pca_2plot, aes(x = PC1, y = PC2, col = as.factor(group), label = Id.Cryostem)) + 
  geom_point() +
  scale_color_manual(breaks = c("primary_tolerant", "secondary_tolerant"),
                     values = c("springgreen3", "royalblue1"))+
  geom_text(aes(label=Id.Cryostem),hjust=0, vjust=0)

It seems like the primary and secondary patients are quite separated in the PCA.

We can also generate a tSNE on the same data:

tsne_all <- Rtsne::Rtsne(pctgs_all_only, perplexity = 3)
tsne_2plot <- as.data.frame(tsne_all$Y) %>% 
  magrittr::set_colnames(c("X1", "X2")) %>% 
  mutate(COUPLENUMBER = as.numeric(rownames(pctgs_all_only))) %>% 
  left_join(pctgs_all, by = "COUPLENUMBER")
tsne_2plot<- tsne_2plot %>% mutate(Id.Cryostem = samp_tmp$Id.Cryostem.R)

ggplot(tsne_2plot, aes(x = X1, y = X2, col = as.factor(group), 
                       label = Id.Cryostem)) + 
  geom_point() +
  scale_color_manual(breaks = c("primary_tolerant", "secondary_tolerant"),
                     values = c("springgreen3", "royalblue1")) +
  geom_text(aes(label=Id.Cryostem),hjust=0, vjust=0)

We can see if building a model using all these features combined leads to good classification results between primary and secondary tolerant patients:

pctgs_all_RF <- pctgs_all %>% 
  select(-"COUPLENUMBER", -"gender_comp", -"age_recip") %>% 
  mutate(group = as.factor(group)) %>% 
  column_to_rownames("METABONAME.x")

set.seed(1)
rf <- ranger::ranger(group~., data = pctgs_all_RF, num.trees = 5000, importance = "impurity")
rf$confusion.matrix
##                     predicted
## true                 primary_tolerant secondary_tolerant
##   primary_tolerant                  6                  1
##   secondary_tolerant                2                  4
paste0("prediction error: ", rf$prediction.error*100)
## [1] "prediction error: 23.0769230769231"

We can see which features were selected in the model to classify primary and secondary tolerant patients in the St Louis cohort:

##            R_X5.acetylamino.6.amino.3.methyluracil 
##                                          0.7691838 
## subst_X1.palmitoleoyl.2.linoleoyl.GPC..16.1.18.2.. 
##                                          0.6356220 
##                      subst_sphingosine.1.phosphate 
##                                          0.6012408 
##     D_palmitoyl.dihydrosphingomyelin..d18.0.16.0.. 
##                                          0.5146193 
##                       D_X3.hydroxypyridine.sulfate 
##                                          0.4231932 
##                                  subst_sphingosine 
##                                          0.4108584 
##                         R_X2.palmitoyl.GPC..16.0.. 
##                                          0.3953339 
##                              D_X3.aminoisobutyrate 
##                                          0.3757233 
##                     subst_X2.palmitoyl.GPC..16.0.. 
##                                          0.3715801 
## subst_X1.oleoyl.2.docosahexaenoyl.GPC..18.1.22.6.. 
##                                          0.3696673 
##                                    subst_succinate 
##                                          0.3205149 
##           subst_N1.Methyl.4.pyridone.3.carboxamide 
##                                          0.2891282 
##                              R_X3.aminoisobutyrate 
##                                          0.2834316 
##               subst_dimethylarginine..SDMA...ADMA. 
##                                          0.2088265

We can visualise the top variables in a heatmap:

mat2plot <- pctgs_all_RF[,c("group", names(sort(rf$variable.importance, decreasing = TRUE)))] %>% 
  rownames_to_column("rownames") %>% 
  arrange(group) %>% 
  column_to_rownames("rownames")

annot_patients <- as.data.frame(mat2plot$group)
rownames(annot_patients) <- rownames(mat2plot)
colnames(annot_patients) <- "group"
ann_colors <- list(group = c(primary_tolerant = "springgreen3", 
                             secondary_tolerant = "royalblue1"))

pheatmap::pheatmap(mat2plot[,-which(colnames(mat2plot)=="group")], 
                   annotation_row = annot_patients,
                   annotation_colors = ann_colors[1],
                   cluster_rows = FALSE,
                   cluster_cols = FALSE,
                   main = "Heatmap with patients ordered by group",
                   scale = "column")

pheatmap::pheatmap(mat2plot[,-which(colnames(mat2plot)=="group")], 
                   annotation_row = annot_patients,
                   annotation_colors = ann_colors[1],
                   cluster_rows = TRUE,
                   main = "Heatmap with patients clustered",
                   scale = "column")

Reminder: high values for the variables that start with “couple_” mean that the difference of expression between the donor and the recipient of the same couple was high for this specific feature.

Focusing on the features that had the same behaviour in both cohorts:

We first combine the features that were selected as highly informative to classify between primary and secondary tolerant patients from the couples, recipients and donors into one big table:

We can generate a PCA and a tSNE using all these features to see how the tolerant and non tolerant patients are represented in the reduced dimensions:

pctgs_all_only <- pctgs_all %>%
  column_to_rownames("COUPLENUMBER") %>% 
  select_if(is.numeric) %>% 
  select(-"age_recip")

pca_all <- prcomp(pctgs_all_only)

The first component of the PCA seems to hold most of the variability in the data by far:

plot(pca_all)

set.seed(1)
pca_2plot <- as.data.frame(pca_all$x) %>% 
  mutate(COUPLENUMBER = as.numeric(rownames(pca_all$x))) %>% 
  left_join(pctgs_all, by = "COUPLENUMBER")

samp_tmp <- samp_rd %>% filter(!is.na(METABONAME)) %>% 
  column_to_rownames("METABONAME")
samp_tmp <- samp_tmp[pca_2plot$METABONAME.x,]
pca_2plot<- pca_2plot %>% mutate(Id.Cryostem = samp_tmp$Id.Cryostem.R)

ggplot(pca_2plot, aes(x = PC1, y = PC2, col = as.factor(group), label = Id.Cryostem)) + 
  geom_point() +
  scale_color_manual(breaks = c("primary_tolerant", "secondary_tolerant"),
                     values = c("springgreen3", "royalblue1"))+
  geom_text(aes(label=Id.Cryostem),hjust=0, vjust=0)

It seems like the primary and secondary patients are quite separated in the PCA.

We can also generate a tSNE on the same data:

tsne_all <- Rtsne::Rtsne(pctgs_all_only, perplexity = 3)
tsne_2plot <- as.data.frame(tsne_all$Y) %>% 
  magrittr::set_colnames(c("X1", "X2")) %>% 
  mutate(COUPLENUMBER = as.numeric(rownames(pctgs_all_only))) %>% 
  left_join(pctgs_all, by = "COUPLENUMBER")
tsne_2plot<- tsne_2plot %>% mutate(Id.Cryostem = samp_tmp$Id.Cryostem.R)

ggplot(tsne_2plot, aes(x = X1, y = X2, col = as.factor(group), 
                       label = Id.Cryostem)) + 
  geom_point() +
  scale_color_manual(breaks = c("primary_tolerant", "secondary_tolerant"),
                     values = c("springgreen3", "royalblue1")) +
  geom_text(aes(label=Id.Cryostem),hjust=0, vjust=0)

We can see if building a model using all these features combined leads to good classification results between primary and secondary tolerant patients:

pctgs_all_RF <- pctgs_all %>% 
  select(-"COUPLENUMBER", -"gender_comp", -"age_recip") %>% 
  mutate(group = as.factor(group)) %>% 
  column_to_rownames("METABONAME.x")

set.seed(1)
rf <- ranger::ranger(group~., data = pctgs_all_RF, num.trees = 5000, importance = "impurity")
rf$confusion.matrix
##                     predicted
## true                 primary_tolerant secondary_tolerant
##   primary_tolerant                  5                  2
##   secondary_tolerant                2                  4
paste0("prediction error: ", rf$prediction.error*100)
## [1] "prediction error: 30.7692307692308"

We can see which features were selected in the model to classify primary and secondary tolerant patients in the St Louis cohort:

## subst_X1.palmitoleoyl.2.linoleoyl.GPC..16.1.18.2.. 
##                                          1.0296828 
##                      subst_sphingosine.1.phosphate 
##                                          0.8963940 
##     D_palmitoyl.dihydrosphingomyelin..d18.0.16.0.. 
##                                          0.7947093 
##                                  subst_sphingosine 
##                                          0.6909541 
##                         R_X2.palmitoyl.GPC..16.0.. 
##                                          0.6758701 
## subst_X1.oleoyl.2.docosahexaenoyl.GPC..18.1.22.6.. 
##                                          0.6508081 
##                     subst_X2.palmitoyl.GPC..16.0.. 
##                                          0.6470765 
##                              R_X3.aminoisobutyrate 
##                                          0.5834281

We can visualise the top variables in a heatmap:

mat2plot <- pctgs_all_RF[,c("group", names(sort(rf$variable.importance, decreasing = TRUE)))] %>% 
  rownames_to_column("rownames") %>% 
  arrange(group) %>% 
  column_to_rownames("rownames")

annot_patients <- as.data.frame(mat2plot$group)
rownames(annot_patients) <- rownames(mat2plot)
colnames(annot_patients) <- "group"
ann_colors <- list(group = c(primary_tolerant = "springgreen3", 
                             secondary_tolerant = "royalblue1"))

pheatmap::pheatmap(mat2plot[,-which(colnames(mat2plot)=="group")], 
                   annotation_row = annot_patients,
                   annotation_colors = ann_colors[1],
                   cluster_rows = FALSE,
                   cluster_cols = FALSE,
                   main = "Heatmap with patients ordered by group",
                   scale = "column")

pheatmap::pheatmap(mat2plot[,-which(colnames(mat2plot)=="group")], 
                   annotation_row = annot_patients,
                   annotation_colors = ann_colors[1],
                   cluster_rows = TRUE,
                   main = "Heatmap with patients clustered",
                   scale = "column")

Reminder: high values for the variables that start with “couple_” mean that the difference of expression between the donor and the recipient of the same couple was high for this specific feature.